R(2+1)D Video Action Recognition
!git clone https://github.com/microsoft/computervision-recipes.git
%cd computervision-recipes
!pip install decord ipywebrtc einops
%cd /content/video_input/
# Optional: fetch sample videos from YouTube with pytube.
# All download calls are left commented out — uncomment the ones you want;
# each downloads the first available stream into the current directory.
from pytube import YouTube
# YouTube('https://www.youtube.com/watch?v=9P7JzTRHz5g').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=0Cl_Q8RjmfI').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=k2eCJ2XI1IA').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=6mhRTDBNQ-M').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=xm9c5HAUBpY').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=K1FPxvdB_to').streams.first().download()
import sys
from collections import deque #
import io
import requests
import os
from time import sleep, time
from threading import Thread
from IPython.display import HTML
from base64 import b64encode
# Third party tools
import decord #
import IPython.display #
from ipywebrtc import CameraStream, ImageRecorder
from ipywidgets import HBox, HTML, Layout, VBox, Widget, Label
import numpy as np
from PIL import Image
import torch
import torch.cuda as cuda
import torch.nn as nn
from torchvision.transforms import Compose
# utils_cv
sys.path.append("/content/computervision-recipes")
from utils_cv.action_recognition.data import KINETICS, Urls
from utils_cv.action_recognition.dataset import get_transforms
from utils_cv.action_recognition.model import VideoLearner
from utils_cv.action_recognition.references import transforms_video as transforms
from utils_cv.common.gpu import system_info, torch_device
from utils_cv.common.data import data_path
%reload_ext autoreload
%autoreload 2
# Print environment/GPU details (helper from utils_cv.common.gpu).
system_info()
# Frames sampled per clip; comment indicates the supported values are 8 or 32.
NUM_FRAMES = 8 # 8 or 32.
IM_SCALE = 128 # resize then crop
INPUT_SIZE = 112 # input clip size: 3 x NUM_FRAMES x 112 x 112
# video sample to download
sample_video_url = Urls.webcam_vid
# file path to save video sample
video_fpath = data_path() / "sample_video.mp4"
# prediction score threshold
SCORE_THRESHOLD = 0.01
# Averaging 5 latest clips to make video-level prediction (or smoothing)
AVERAGING_SIZE = 5
# Build the learner on the "kinetics" base model with NUM_FRAMES-long clips
# — presumably a pretrained Kinetics model; confirm in VideoLearner docs.
learner = VideoLearner(base_model="kinetics", sample_length=NUM_FRAMES)
# Class names shipped with the KINETICS dataset metadata.
LABELS = KINETICS.class_names
# Notebook cell output: peek at the first ten class names.
LABELS[:10]
# Subset of class names the demo restricts predictions to — actions that are
# plausible in front of a webcam. Entries must match LABELS spelling exactly.
TARGET_LABELS = [
"assembling computer",
"applying cream",
"brushing teeth",
"clapping",
"cleaning floor",
"cleaning windows",
"drinking",
"eating burger",
"eating chips",
"eating doughnuts",
"eating hotdog",
"eating ice cream",
"fixing hair",
"hammer throw",
"high kick",
"jogging",
"laughing",
"mopping floor",
"moving furniture",
"opening bottle",
"plastering",
"punching bag",
"punching person (boxing)",
"pushing cart",
"reading book",
"reading newspaper",
"rock scissors paper",
"running on treadmill",
"shaking hands",
"shaking head",
"side kick",
"slapping",
"smoking",
"sneezing",
"spray painting",
"spraying",
"stretching arm",
"stretching leg",
"sweeping floor",
"swinging legs",
"texting",
"throwing axe",
"throwing ball",
"unboxing",
"unloading truck",
"using computer",
"using remote controller (not gaming)",
"welding",
"writing",
"yawning",
]
# Notebook cell output: how many target labels are in play.
len(TARGET_LABELS)
# Optional: play a local MP4 inline by embedding it as a base64 data URL.
# path = 'video.mp4'
# mp4 = open(path,'rb').read()
# data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
# HTML("""<video width=400 controls><source src="%s" type="video/mp4"></video>""" % data_url)